## Rows: 342,867
## Columns: 30
## $ report_dat <chr> "4/26/2008 11:10:00 AM", "2/12/2009 11:04:00 P...
## $ month <dbl> 4, 2, 3, 7, 6, 3, 12, 4, 8, 8, 8, 9, 1, 4, 8, ...
## $ day <dbl> 26, 12, 16, 22, 23, 22, 28, 16, 8, 31, 21, 27,...
## $ year <dbl> 2008, 2009, 2012, 2009, 2017, 2012, 2012, 2014...
## $ hour <dbl> 11, 23, 11, 19, 18, 13, 13, 12, 14, 2, 20, 16,...
## $ minute <dbl> 10, 4, 30, 30, 59, 12, 39, 23, 0, 40, 44, 13, ...
## $ second <dbl> 0, 0, 0, 0, 37, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, ...
## $ anc <chr> "2C", "1B", "5E", "5A", "1A", "2F", "7C", "2B"...
## $ block <chr> "700 - 723 BLOCK OF 14TH STREET NW", "2390 - 2...
## $ block_group <chr> "005800 1", "003400 2", "008702 1", "009501 1"...
## $ crimetype <chr> "Non-Violent", "Non-Violent", "Non-Violent", "...
## $ end_date <chr> "4/26/2008 10:41:00 AM", "2/12/2009 12:00:00 A...
## $ ew <chr> "East", "East", "East", "East", "East", "East"...
## $ method <chr> "OTHERS", "OTHERS", "OTHERS", "OTHERS", "OTHER...
## $ neighborhood_cluster <chr> "Cluster 8", "Cluster 3", "Cluster 21", "Clust...
## $ ns <chr> "North", "North", "North", "North", "North", "...
## $ offense <chr> "THEFT/OTHER", "THEFT/OTHER", "THEFT F/AUTO", ...
## $ quad <chr> "Northeast", "Northeast", "Northeast", "Northe...
## $ shift <chr> "DAY", "MIDNIGHT", "DAY", "EVENING", "EVENING"...
## $ start_date <chr> "4/26/2008 10:40:00 AM", "2/12/2009 4:30:00 PM...
## $ voting_precinct <chr> "Precinct 129", "Precinct 37", "Precinct 75", ...
## $ ccn <dbl> 8055446, 9020445, 12036569, 9103366, 17107226,...
## $ census_tract <dbl> 5800, 3400, 8702, 9501, 3000, 4902, 7804, 4300...
## $ district <dbl> 2, 3, 5, 4, 3, 3, 6, 3, 4, 3, 2, 7, 5, 6, 5, 5...
## $ psa <dbl> 207, 306, 502, 405, 302, 308, 602, 301, 401, 3...
## $ ward <dbl> 2, 1, 5, 5, 1, 2, 7, 2, 4, 1, 2, 8, 5, 8, 5, 5...
## $ x <dbl> 60706, 295890, 121333, 114544, 323805, 48697, ...
## $ x1 <dbl> 60706, 295890, 121333, 114544, 323805, 48697, ...
## $ xblock <dbl> -77.03196, -77.02129, -76.99953, -77.00746, -7...
## $ yblock <dbl> 38.89885, 38.92368, 38.91687, 38.94822, 38.929...
#remove unused cols
a = a %>% select(!c(minute, second, anc, block, block_group, end_date, ew, neighborhood_cluster, ns, start_date, voting_precinct, ccn, district, x, x1)) %>%
mutate(
report_dat = anytime::anydate(report_dat),
#start_date = anytime::anydate(start_date),
#end_date = anytime::anydate(end_date),
across(where(is.character), factor),
census_tract = factor(census_tract, levels = a$census_tract %>% unique %>% sort),
ward = factor(ward, levels = a$ward %>% unique %>% sort),
psa = factor(psa, levels = a$psa %>% unique %>% sort),
year = factor(year, levels = a$year %>% unique %>% sort),
month = factor(month, levels = a$month %>% unique %>% sort),
day = factor(day, levels = a$day %>% unique %>% sort),
hour = factor(hour, levels = a$hour %>% unique %>% sort)
) %>%
select(sort(tidyselect::peek_vars())) %>%
select(
where(is.Date),
month, day, year, hour,
where(is.character),
where(is.factor),
where(is.numeric)
) %>% arrange(report_dat, crimetype, offense)
abak = a
#saveRDS(abak, 'cleaned_data_10yrs.RDS')
#a = abakNotes
## Rows: 10,594
## Columns: 12
## $ month <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ day <fct> 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, ...
## $ year <fct> 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, ...
## $ hour <fct> 3, 13, 8, 21, 16, 20, 20, 7, 11, 3, 8, 13, 8, 21, 17, ...
## $ census_tract <fct> 7809, 9204, 10600, 7603, 3301, 10200, 7803, 9810, 1090...
## $ crimetype <fct> Non-Violent, Non-Violent, Non-Violent, Non-Violent, No...
## $ method <fct> OTHERS, OTHERS, OTHERS, OTHERS, OTHERS, OTHERS, OTHERS...
## $ offense <fct> BURGLARY, MOTOR VEHICLE THEFT, THEFT F/AUTO, MOTOR VEH...
## $ psa <fct> 608, 502, 501, 606, 501, 105, 602, 708, 708, 403, 107,...
## $ quad <fct> Northeast, Northeast, Northeast, Southeast, Northeast,...
## $ shift <fct> MIDNIGHT, DAY, DAY, EVENING, EVENING, EVENING, EVENING...
## $ ward <fct> 7, 5, 6, 7, 5, 6, 7, 8, 8, 4, 6, 8, 5, 1, 1, 6, 6, 1, ...
#defining custom color palette
jpal = grDevices::colorRampPalette(brewer.pal(8,'Dark2'))(25)
a %>% select(where(is.factor)) %>% map_df(n_unique)## $month
## [1] 1 2 3 4 5 6 7 8 9 10 11 12
## Levels: 1 2 3 4 5 6 7 8 9 10 11 12
##
## $day
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## [26] 26 27 28 29 30 31
## 31 Levels: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 ... 31
##
## $year
## [1] 2012 2013 2014 2015 2016 2017
## Levels: 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017
##
## $hour
## [1] 3 13 8 21 16 20 7 11 17 14 9 10 4 18 12 15 2 0 19 6 22 1 23 5
## Levels: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
##
## $census_tract
## [1] 7809 9204 10600 7603 3301 10200 7803 9810 10900 2002 8200 7504
## [13] 8803 2802 3000 8410 3200 5201 2702 9604 9505 9504 11100 9102
## [25] 4002 9700 4802 6600 4400 11000 4600 9803 502 600 8701 9507
## [37] 6202 3700 5800 2701 7502 5002 9503 9903 9811 2202 10400 4801
## [49] 4100 8804 8904 9603 8903 9203 3100 1702 5600 9509 10100 4902
## [61] 7304 1001 7709 202 8100 8402 5500 1901 2900 8302 9601 6801
## [73] 2101 9400 9301 7409 7708 4702 3400 6400 7707 400 5001 7401
## [85] 4202 10700 2502 6500 1600 2102 4300 7403 7200 9602 100 7404
## [97] 3800 3600 9802 8001 9904 5900 7703 2600 3302 8802 7100 9901
## [109] 2801 300 501 7901 7406 2501 7503 8301 1100 9907 10300 7407
## [121] 1002 4901 9807 7605 8002 9508 9501 7408 5301 1402 4001 901
## [133] 3500 8702 7000 2201 10800 2400 7604 1804 10500 9000 9302 1302
## [145] 7808 1803 9905 1200 7601 9804 2001 9201 7804 4201 9906 9902
## [157] 9801 4701 7806 702 3900 802 701 7807 6802 6900 2302 6804
## [169] 2301 7903 201 1401 801 1301 6700 1902 7301 1500 902
## 179 Levels: 100 201 202 300 400 501 502 600 701 702 801 802 901 902 ... 11100
##
## $crimetype
## [1] Non-Violent Violent
## Levels: Non-Violent Violent
##
## $method
## [1] OTHERS GUN KNIFE
## Levels: GUN KNIFE OTHERS
##
## $offense
## [1] BURGLARY MOTOR VEHICLE THEFT
## [3] THEFT F/AUTO THEFT/OTHER
## [5] ASSAULT W/DANGEROUS WEAPON ROBBERY
## [7] SEX ABUSE ARSON
## [9] HOMICIDE
## 9 Levels: ARSON ASSAULT W/DANGEROUS WEAPON BURGLARY ... THEFT/OTHER
##
## $psa
## [1] 608 502 501 606 105 602 708 403 107 701 506 302 104 409 307 408 603 406 504
## [20] 503 303 706 308 305 707 204 404 102 304 101 702 405 103 505 306 208 507 401
## [39] 207 705 202 605 206 402 601 108 704 703 301 203 106 604 201 407 607 205
## 56 Levels: 101 102 103 104 105 106 107 108 201 202 203 204 205 206 207 ... 708
##
## $quad
## [1] Northeast Southeast Northwest
## Levels: Northeast Northwest Southeast
##
## $shift
## [1] MIDNIGHT DAY EVENING
## Levels: DAY EVENING MIDNIGHT
##
## $ward
## [1] 7 5 6 8 4 1 2 3
## Levels: 1 2 3 4 5 6 7 8
a %>% filter(year != 2017) %>% select(where(is.factor)) %>% DataExplorer::plot_bar(ncol = 1, nrow = 1, title = 'Total Offenses by Category - Historic')## 2 columns ignored with more than 50 categories.
## census_tract: 179 categories
## psa: 56 categories
Observations:
a %>% filter(year == 2017) %>% select(where(is.factor)) %>% DataExplorer::plot_bar(ncol = 1, nrow = 1, title = 'Total Offenses by Category - 2017')## 2 columns ignored with more than 50 categories.
## census_tract: 174 categories
## psa: 56 categories
Observations relative to Historic
ggplotly(
a %>% filter(year != 2017) %>%
count(ward, offense) %>% ggplot(aes(x = offense, y = n, fill = offense)) +
geom_col() +
coord_flip() +
labs(x = '', y = 'count', title = 'Total Offenses by Type/Ward - Historic') +
facet_wrap(~ward) +
theme(legend.position = 'none')
)Observations:
ggplotly(
a %>% filter(year == 2017) %>%
count(ward, offense) %>% ggplot(aes(x = offense, y = n, fill = offense)) +
geom_col() +
coord_flip() +
labs(x = '', y = 'count', title = 'Total Offenses by Type/Ward - 2017') +
facet_wrap(~ward) +
theme(legend.position = 'none')
)Observations relative to Historic
# create col for start of month (a 'month' col) used for grouping and graphing
a = a %>% mutate(
monthkey = lubridate::make_datetime(
as.numeric(as.character(year)),
as.numeric(as.character(month)),
1)
) %>% relocate(report_dat, monthkey, everything())#Total Offenses over time
a %>% group_by(monthkey) %>%
summarise(total.offenses = n()) %>%
ungroup() %>%
plot_ly(x = ~monthkey, y =~total.offenses) %>%
add_lines(size = I(3)) %>% layout(
xaxis = list(title = ''),
yaxis = list(title = ''),
title = 'Total Offenses by Month'
)Observations
ggplotly(
a %>% group_by(monthkey, offense)%>%
summarise(total.offenses = n()) %>%
ungroup() %>%
mutate(offense = fct_reorder(offense, total.offenses, .fun = mean)) %>%
ggplot(aes(monthkey, total.offenses, fill = offense)) +
geom_area() +
labs(title = 'Total Offenses Percentage (#) by Month/Offense Type', x = '', y = '')
)ggplotly(
a %>% group_by(monthkey, offense)%>%
summarise(total.offenses = n()) %>%
mutate(total.offenses.pct = total.offenses/sum(total.offenses)) %>%
ungroup() %>%
mutate(offense = fct_reorder(offense, total.offenses, .fun = mean)) %>%
ggplot(aes(monthkey, total.offenses.pct, fill = offense)) +
geom_area() +
scale_y_continuous(labels = scales::percent) +
labs(title = 'Total Offenses Percentage (%) by Month/Offense Type', x = '', y = '')
)ggplotly(
a %>% group_by(monthkey, ward)%>%
summarise(total.offenses = n()) %>%
ungroup() %>%
mutate(ward = fct_reorder(ward, total.offenses, .fun = mean)) %>%
ggplot(aes(monthkey, total.offenses, fill = ward)) +
geom_area() +
labs(title = 'Total Offenses Percentage (#) by Month/Ward', x = '', y = '')
)ggplotly(
a %>% group_by(monthkey, ward)%>%
summarise(total.offenses = n()) %>%
mutate(total.offenses.pct = total.offenses/sum(total.offenses)) %>%
ungroup() %>%
mutate(ward = fct_reorder(ward, total.offenses, .fun = mean)) %>%
ggplot(aes(monthkey, total.offenses.pct, fill = ward)) +
geom_area() +
scale_y_continuous(labels = scales::percent) +
labs(title = 'Total Offenses Percentage (%) by Month/Ward', x = '', y = '')
) ## Rows: 10,565
## Columns: 2
## $ xblock <dbl> -76.92713, -76.99716, -77.00740, -76.94992, -77.01060, -77.0...
## $ yblock <dbl> 38.90078, 38.92093, 38.90645, 38.86128, 38.92023, 38.87786, ...
a %>% plot_ly(x = ~xblock, y = ~yblock, color = ~ward) %>% add_markers() %>% layout(title = 'WDC Wards')a %>% plot_ly(x = ~xblock, y = ~yblock, color = ~census_tract) %>% add_markers() %>% layout(title = 'WDC Wards Census Tracts') %>% hide_legend()## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## == Use anomalize to improve your Forecasts by 50%! ==============================
## Business Science offers a 1-hour course - Lab #18: Time Series Anomaly Detection!
## </> Learn more at: https://university.business-science.io/p/learning-labs-pro </>
# anomalize(data, target, method = c("iqr", "gesd"), alpha = 0.05, max_anoms = 0.2, verbose = FALSE)
# alpha: Controls the width of the "normal" range. Lower values are more conservative while higher values are less prone to incorrectly classifying "normal" observations.
# max_anoms: The maximum percent of anomalies permitted to be identified.
a.anomalize = a %>%
group_by(monthkey) %>%
summarise(total.offenses = n()) %>%
time_decompose(total.offenses, method = 'twitter', merge = TRUE) %>%
anomalize(remainder, alpha = 0.30, method = 'gesd') %>%
time_recompose()## `summarise()` ungrouping output (override with `.groups` argument)
## Converting from tbl_df to tbl_time.
## Auto-index message: index = monthkey
## frequency = 12 months
## median_span = 35.5 months
ggplotly(
a.anomalize %>%
filter(monthkey < as.Date('2017-11-01')) %>%
plot_anomalies(
ncol = 1,
alpha_dots = 0.5,
alpha_circles = 0.5,
size_circles = 1.5,
time_recomposed = TRUE,
alpha_ribbon = 0.05
) + scale_y_continuous(labels = comma) +
labs(x = '', y = 'total.offenses', title = 'total.offenses')
)a %>% count(monthkey, crimetype, name = 'count') %>%
select(-monthkey) %>%
plot_ly(y = ~crimetype, x = ~count, color = ~crimetype) %>%
add_boxplot() %>%
layout(
title = 'Daily Crime Cases by Crime Type ',
xaxis = list(title ='Roughly 5 Year Distribution'),
yaxis = list(title ='')
)## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
library(prophet)
#renaming cols to prophet's col conventions
a.agg = a %>% filter(crimetype == 'Violent') %>%
group_by(report_dat = round_date(report_dat,'day')) %>%
summarise(total.offenses = n()) %>%
select(ds = report_dat, y = total.offenses)
#creating model
a.agg.model = a.agg %>% prophet()
#using model make future period df
a.agg.future = a.agg.model %>% make_future_dataframe(
periods = 28, #next 4 wks
freq = 'day')
#make forecasts df
a.agg.forecast = a.agg.model %>% predict(a.agg.future)
#plot forecast
a.agg.model %>% dyplot.prophet(a.agg.forecast)## Warning: `select_()` is deprecated as of dplyr 0.7.0.
## Please use `select()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.